# -*- coding: utf-8 -*-
# @Date    : 2021/5/18
# @Author  : Maoxian

# Scrape the Douban Top 250 movie list using re regular expressions
import re
import time

import requests

# HTTP request headers passed to requests.get for every page fetch; the
# User-Agent identifies the client as a desktop Firefox browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) '
        'Gecko/20100101 Firefox/53.0'
    ),
}


def get_douban_250():
    """Scrape the Douban Top 250 movie list and save it as JSON and CSV.

    Requests ``https://movie.douban.com/top250`` ten times with ``start``
    offsets 0, 25, ..., 225, extracts one record per ``<div class="item">``
    with a regular expression, prints each record, and finally passes the
    accumulated list to ``data_to_json`` and ``data_to_csv``.

    Each record is a dict with the keys ``rank``, ``title``, ``year``,
    ``country``, ``type``, ``rating`` and ``rating_count``; every value is
    the string captured from the page markup (``year`` and ``type`` are
    stripped of surrounding whitespace).

    Raises:
        requests.RequestException: if a page cannot be fetched within the
            timeout or the server answers with an HTTP error status.
    """
    # Compiled once, outside the page loop. re.S makes "." match newlines so
    # the lazy ".*?" gaps can span the multi-line markup of one movie entry.
    item_pattern = re.compile(
        r'<div class="item">.*?'
        r'<em class="">(?P<rank>.*?)</em>.*?'
        r'<span class="title">(?P<title>.*?)</span>.*?'
        r'<br>(?P<year>.*?)&nbsp;/&nbsp;(?P<country>.*?)&nbsp;/&nbsp;(?P<type>.*?)</p>.*?'
        r'<span class="rating_num" property="v:average">(?P<rating>.*?)</span>.*?'
        r'<span>(?P<rating_count>.*?)人评价</span>.*?',
        re.S,
    )

    top_250 = []
    for page in range(10):
        url = f'https://movie.douban.com/top250?start={25 * page}&filter='
        # Without a timeout a stalled connection would block the script forever.
        res = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on an error response instead of silently parsing an
        # error page into zero records.
        res.raise_for_status()
        matches = item_pattern.finditer(res.text)
        # Pause between page requests.
        time.sleep(1)
        for match in matches:
            dic = match.groupdict()
            dic['year'] = dic['year'].strip()
            dic['type'] = dic['type'].strip()
            top_250.append(dic)
            print(dic)
    data_to_json(top_250)
    data_to_csv(top_250)


def data_to_csv(data):
    """Write the movie records to ``data.csv`` in the working directory.

    The file is overwritten on every call and starts with a header row of
    the columns ``rank, title, year, country, type, rating, rating_count,
    inq``.

    Args:
        data: iterable of dicts, one per movie. Keys missing from a dict are
            written as empty cells.

    Raises:
        ValueError: if a dict contains a key that is not one of the columns
            (``csv.DictWriter`` default behaviour).
    """
    import csv

    fieldnames = ['rank', 'title', 'year', 'country', 'type', 'rating', 'rating_count', 'inq']
    # newline='' is required by the csv module: the writer emits its own
    # "\r\n" terminators, and letting the text layer translate them again
    # yields "\r\r\n" (blank lines between rows) on Windows.
    with open('data.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        writer.writerows(data)


def data_to_json(data):
    """Write the movie records to ``data.json`` in the working directory.

    The file is overwritten on every call so that it always holds exactly
    one JSON document; appending a second dump to an existing file would
    leave it unparseable.

    Args:
        data: JSON-serialisable object (here, the list of movie dicts).
    """
    import json

    with open('data.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese titles)
        # readable instead of escaping it as \uXXXX sequences.
        json.dump(data, f, ensure_ascii=False)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    get_douban_250()
